Preparation

0. Import libraries and datasets
library(data.table)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(stats)
library(tidyverse)
## ─ Attaching packages ──────────────────────────── tidyverse 1.3.0 ─
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.4
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ─ Conflicts ───────────────────────────── tidyverse_conflicts() ─
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
options("scipen" = 100, "digits" = 4)

# Read the four raw datasets. The data directory is defined once so the
# path only has to change in one place if the project moves.
data_dir <- "/Users/robinho/Nutstore Files/01 ESADE/Study/05 MSc Business Analytics 19_20/04 Term2/01 R language/Data Analytics with R/Challenge/Challenge 2/DATA"

machine_data       <- fread(file.path(data_dir, "machine_data.csv"))
product_data       <- fread(file.path(data_dir, "product_data.csv"))
transactional_data <- fread(file.path(data_dir, "transactional_data.csv"))
machine_failures   <- fread(file.path(data_dir, "machine_failures.csv"))

I. Creating the relevant variable of the model

1. Merge the transactional dataset with the machine failures data set setting failure variable to 0 when no failure is recorded.

# 1-1. Left-join machine_failures onto the transactional data: every sale
#      keeps its row even when no failure was recorded for it.
complete_data <- merge(transactional_data,
                       machine_failures,
                       by = c("machine", "timestamp", "column"),
                       all.x = TRUE)  # TRUE, not T: T/F can be reassigned

# 1-2. Sales with no matching failure record receive NA from the join;
#      recode those to 0 ("no failure").
complete_data[is.na(failure), failure := 0]
complete_data

2. In the transactional data table, create a variable called last_vend containing the timestamp of the previous sale of each machine.

# 2-1. Sort by machine and time *by reference*. The original
#      `complete_data[order(machine, timestamp)]` only returned a sorted
#      copy and discarded it, so the shift below could operate on rows in
#      arbitrary order.
setorder(complete_data, machine, timestamp)
# 2-2. last_vend = timestamp of the previous sale of the same machine
#      (NA on each machine's first recorded sale).
complete_data[, last_vend := shift(timestamp, 1), by = machine]
complete_data

3. Create a new variable in the transactional data table called deltahours containing, for every sale, the hours that passed since the last sale.

# Hours elapsed since the previous sale of the same machine;
# NA on each machine's first sale, where last_vend is NA.
complete_data[, `:=`(deltahours = difftime(timestamp, last_vend, units = 'hours'))]
complete_data

4. Create an auxiliary data table called machine_daily_average with the average daily sales per machine. Use this auxiliary table to attach the average daily sales per machine to every row of the transactional data table. You can do this by doing a merge.

# 4-1. Transactions per machine.
#      Comment: days on which the machine had a failure still count as
#      active days for this calculation.
num_prod <- complete_data[, .(num_prod = .N), by = "machine"]
head(num_prod)
# 4-2. Number of distinct days each machine was working
working_days <- complete_data[, .(num_days = uniqueN(date)), by = "machine"]
head(working_days)
# 4-3. Average daily sales per machine.
#      Join the two aggregates on machine before dividing: the original
#      code divided num_prod$num_prod by num_days positionally inside the
#      working_days table, which silently breaks if the two aggregates'
#      row orders ever differ.
machine_daily_average <- merge(num_prod, working_days, by = "machine")[
  , .(machine, daily_sales_machine = num_prod / num_days)]

# 4-4. Attach the per-machine daily average to every transaction row
complete_data <- merge(complete_data,
                       machine_daily_average,
                       by = "machine")
complete_data

5. Create a new variable called delta in the transactional data table containing a normalized version of deltahours consisting on the deltahours associated with each sale divided by the average deltahours of each machine i.e. delta = deltahours /(24/daily_sales_machine). The interpretation of delta is the amount of missed sales if the machine was selling at a constant rate.

# delta = deltahours normalised by the machine's average inter-sale gap
# (on average 24 / daily_sales_machine hours pass between sales), i.e. the
# number of sales missed if the machine were selling at a constant rate.
complete_data[, delta := as.numeric(deltahours) / (24 / daily_sales_machine)]
The manipulated dataset now looks as follows:
# Preview the dataset with the new delta column
complete_data

II. Creating the model

6. Select 30% of the machines in the transactional data for testing and 70% of the machines for training and train a linear logistic regression model called m to predict whether a machine has a failure as a function of variable delta.

# 6-1. All distinct machine ids
machine_list <- unique(complete_data$machine)

# 6-2. Sample 70% of the *machines* (not rows) for training, so no machine
#      appears in both sets. Seed fixed for reproducibility.
set.seed(5)
training_machine_list <- sample(machine_list,
                                round(0.7 * length(machine_list), 0),
                                replace = FALSE)  # FALSE, not F: F is reassignable
train <- complete_data[machine %in% training_machine_list]

# 6-3. The remaining 30% of the machines form the test set
test_machine_list <- setdiff(machine_list, training_machine_list)
test <- complete_data[machine %in% test_machine_list]

# 6-4. Logistic regression: P(failure) as a function of delta
m <- glm(failure ~ delta, data = train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
What is the value of the intercept and the coefficient accompanying variable delta?
# 6-5. Model information: the intercept and the coefficient on delta are
#      reported in the Coefficients table of the summary below.
summary(m)
## 
## Call:
## glm(formula = failure ~ delta, family = "binomial", data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.037  -0.062  -0.048  -0.045   3.718  
## 
## Coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept) -6.91221    0.02307    -300 <0.0000000000000002 ***
## delta        0.56273    0.00314     179 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 103916  on 1287048  degrees of freedom
## Residual deviance:  58491  on 1287047  degrees of freedom
##   (1746 observations deleted due to missingness)
## AIC: 58495
## 
## Number of Fisher Scoring iterations: 9
  • Comments: 1) Value of the intercept: -6.91221; 2) Coefficient of delta: 0.56273.